#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(expss)
library(polycor)
library(foreach) ; library(doParallel)
library(knitr)
suppressWarnings(suppressMessages(library(WGCNA)))
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_21_WGCNA.Rmd)
# Gandal dataset
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# GO Neuronal annotations
GO_annotations = read.csv('./../../../PhD-InitialExperiments/FirstYearReview/Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# Clusterings
clusterings = read_csv('./../Data/Gandal/clusters.csv')
# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
left_join(GO_neuronal, by='ID') %>% left_join(clusterings, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`),
significant=padj<0.05 & !is.na(padj))
clustering_selected = 'DynamicHybrid'
genes_info$Module = genes_info[,clustering_selected]
dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]
rm(DE_info, GO_annotations, clusterings)
Using the hetcor function, that calculates Pearson, polyserial or polychoric correlations depending on the type of variables involved.
datTraits = datMeta %>% dplyr::select(Diagnosis_, Region, Sex, Age, PMI, RNAExtractionBatch) %>%
rename('Diagnosis' = Diagnosis_, 'ExtractionBatch' = RNAExtractionBatch)
# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)
# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))
# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)
# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated'))
}
rm(ME_object)
I’m going to select all the modules that have an absolute correlation higher than 0.9 with Diagnosis to study them
moduleTraitCor = moduleTraitCor[complete.cases(moduleTraitCor),]
moduleTraitPvalue = moduleTraitPvalue[complete.cases(moduleTraitCor),]
# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]
# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)
labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels = gsub('ME','',rownames(moduleTraitCor)),
yColorWidth=0, colors = brewer.pal(11,'PiYG'), bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, cex.lab.y = 0.75, zlim = c(-1,1),
main = paste('Module-Trait relationships'))
diagnosis_cor = data.frame('Module' = gsub('ME','',rownames(moduleTraitCor)),
'MTcor' = moduleTraitCor[,'Diagnosis'],
'MTpval' = moduleTraitPvalue[,'Diagnosis'])
genes_info = genes_info %>% left_join(diagnosis_cor, by='Module')
rm(moduleTraitPvalue, datTraits, textMatrix, diagnosis_cor)
top_modules = gsub('ME','',rownames(moduleTraitCor)[abs(moduleTraitCor[,'Diagnosis'])>0.9])
cat(paste0('Top modules selected: ', paste(top_modules, collapse=', '),'\n'))
## Top modules selected: #9B8EFF, #F066EA, #8DAB00
pca = datExpr %>% prcomp
plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
mutate(color = ifelse(ImportantModules=='Others','gray',ImportantModules),
alpha = ifelse(ImportantModules=='Others', 0.1, 0.3))
table(plot_data$ImportantModules)
##
## #8DAB00 #9B8EFF #F066EA Others
## 1277 215 1158 13950
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) +
geom_point(alpha=plot_data$alpha, color=plot_data$color, aes(ID=ID)) + theme_minimal() +
ggtitle('Modules with strongest relation to Diagnosis'))
rm(pca)
create_plot = function(module){
plot_data = dataset %>% dplyr::select(ID, paste0('MM.',gsub('#','',module)), GS, gene.score) %>% filter(dataset$Module==module)
colnames(plot_data)[2] = 'Module'
SFARI_colors = as.numeric(names(table(as.character(plot_data$gene.score)[plot_data$gene.score!='None'])))
p = ggplotly(plot_data %>% ggplot(aes(Module, GS, color=gene.score)) + geom_point(alpha=0.5, aes(ID=ID)) +
scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,8))) + theme_minimal() +
ggtitle(paste0('Module ', module,' (MTcor = ', round(moduleTraitCor[paste0('ME',module),1],2),')')))
return(p)
}
create_plot(top_modules[1])
create_plot(top_modules[2])
create_plot(top_modules[3])
rm(create_plot)
Modules with the strongest module-diagnosis correlation should have the highest percentage of SFARI Genes, but the plot shows this relation in modules with correlations close to 0.
plot_data = dataset %>% mutate('hasSFARIscore' = gene.score!='None') %>%
group_by(Module, MTcor, hasSFARIscore) %>% summarise(p=n()) %>%
left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>%
mutate(p=round(p/n*100,2))
for(i in 1:nrow(plot_data)){
this_row = plot_data[i,]
if(this_row$hasSFARIscore==FALSE & this_row$p==100){
new_row = this_row
new_row$hasSFARIscore = TRUE
new_row$p = 0
plot_data = plot_data %>% rbind(new_row)
}
}
plot_data = plot_data %>% filter(hasSFARIscore==TRUE)
ggplotly(plot_data %>% ggplot(aes(MTcor, p, size=n)) + geom_smooth(color='gray', se=FALSE) +
geom_point(color=plot_data$Module, aes(id=Module)) +
xlab('Module-Diagnosis correlation') + ylab('% of SFARI genes') +
theme_minimal() + theme(legend.position = 'none'))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
rm(i, this_row, new_row, plot_data)
Selecting the modules with the highest correlation to Diagnosis, and, from them, the genes with the highest module membership-(absolute) gene significance
*Ordered by \(\frac{MM+|GS|}{2}\)
Only two SFARI Genes in the three lists, none from scores 1 and 2…
create_table = function(module){
top_genes = dataset %>% dplyr::select(ID, paste0('MM.',gsub('#','',module)), GS, gene.score) %>% filter(dataset$Module==module) %>%
rename('MM' = paste0('MM.',gsub('#','',module))) %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
top_n(10)
return(top_genes)
}
top_genes_1 = create_table(top_modules[1])
kable(top_genes_1, caption=paste0('Top 10 genes for module ', top_modules[1], ' (MTcor = ',
round(moduleTraitCor[paste0('ME',top_modules[1]),1],2),')'))
| ID | MM | GS | gene.score | importance |
|---|---|---|---|---|
| ENSG00000143384 | 0.8768314 | 0.9060349 | None | 0.8914332 |
| ENSG00000161638 | 0.8613780 | 0.8646717 | None | 0.8630249 |
| ENSG00000183864 | 0.7916638 | 0.8719542 | None | 0.8318090 |
| ENSG00000196935 | 0.7680729 | 0.8951553 | None | 0.8316141 |
| ENSG00000150907 | 0.7883168 | 0.8458336 | None | 0.8170752 |
| ENSG00000148841 | 0.8218756 | 0.8042687 | None | 0.8130721 |
| ENSG00000150457 | 0.7719291 | 0.8433350 | None | 0.8076321 |
| ENSG00000101493 | 0.7243000 | 0.8764275 | None | 0.8003638 |
| ENSG00000148498 | 0.6952386 | 0.8949607 | None | 0.7950996 |
| ENSG00000120278 | 0.7711860 | 0.8073816 | None | 0.7892838 |
top_genes_2 = create_table(top_modules[2])
kable(top_genes_2, caption=paste0('Top 10 genes for module ', top_modules[2], ' (MTcor = ',
round(moduleTraitCor[paste0('ME',top_modules[2]),1],2),')'))
| ID | MM | GS | gene.score | importance |
|---|---|---|---|---|
| ENSG00000180573 | 0.8413858 | 0.8648084 | None | 0.8530971 |
| ENSG00000181722 | 0.8640272 | 0.8406765 | 3 | 0.8523519 |
| ENSG00000003402 | 0.8807946 | 0.8010251 | None | 0.8409098 |
| ENSG00000184678 | 0.7903735 | 0.8878029 | None | 0.8390882 |
| ENSG00000120690 | 0.7921926 | 0.7836121 | None | 0.7879024 |
| ENSG00000107104 | 0.8162030 | 0.7554489 | 4 | 0.7858260 |
| ENSG00000084093 | 0.8091856 | 0.7472778 | None | 0.7782317 |
| ENSG00000198879 | 0.8039142 | 0.7457312 | None | 0.7748227 |
| ENSG00000198185 | 0.7958118 | 0.7482581 | None | 0.7720349 |
| ENSG00000051620 | 0.7890904 | 0.7462823 | None | 0.7676863 |
top_genes_3 = create_table(top_modules[3])
kable(top_genes_3, caption=paste0('Top 10 genes for module ', top_modules[3],' (MTcor = ',
round(moduleTraitCor[paste0('ME',top_modules[3]),1],2),')'))
| ID | MM | GS | gene.score | importance |
|---|---|---|---|---|
| ENSG00000050748 | 0.9014380 | -0.9399810 | None | 0.9207095 |
| ENSG00000108395 | 0.9067664 | -0.9250494 | None | 0.9159079 |
| ENSG00000138078 | 0.8923376 | -0.8966280 | None | 0.8944828 |
| ENSG00000177432 | 0.8590945 | -0.8915541 | None | 0.8753243 |
| ENSG00000163577 | 0.8377986 | -0.9116043 | None | 0.8747015 |
| ENSG00000176490 | 0.8551447 | -0.8936753 | None | 0.8744100 |
| ENSG00000155097 | 0.8548700 | -0.8885175 | None | 0.8716938 |
| ENSG00000128881 | 0.8740293 | -0.8654230 | None | 0.8697261 |
| ENSG00000171132 | 0.8345108 | -0.9028094 | None | 0.8686601 |
| ENSG00000114573 | 0.8277575 | -0.9068579 | None | 0.8673077 |
rm(create_table)
pca = datExpr %>% prcomp
plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
mutate(color = ifelse(Module %in% top_modules, as.character(Module), 'gray')) %>%
mutate(alpha = ifelse(color %in% top_modules &
ID %in% c(as.character(top_genes_1$ID),
as.character(top_genes_2$ID),
as.character(top_genes_3$ID)), 1, 0.1))
plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) +
theme_minimal() + ggtitle('Important genes identified through WGCNA')
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] WGCNA_1.68 fastcluster_1.1.25 dynamicTreeCut_1.63-1
## [4] knitr_1.24 doParallel_1.0.15 iterators_1.0.12
## [7] foreach_1.4.7 polycor_0.7-10 expss_0.10.1
## [10] GGally_1.4.0 gridExtra_2.3 viridis_0.5.1
## [13] viridisLite_0.3.0 RColorBrewer_1.1-2 dendextend_1.13.2
## [16] plotly_4.9.1 glue_1.3.1 reshape2_1.4.3
## [19] forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3
## [22] purrr_0.3.3 readr_1.3.1 tidyr_1.0.0
## [25] tibble_2.1.3 ggplot2_3.2.1 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.5
## [3] Hmisc_4.2-0 plyr_1.8.5
## [5] lazyeval_0.2.2 splines_3.6.0
## [7] crosstalk_1.0.0 BiocParallel_1.20.1
## [9] GenomeInfoDb_1.22.0 robust_0.4-18.2
## [11] digest_0.6.23 htmltools_0.4.0
## [13] GO.db_3.10.0 fansi_0.4.1
## [15] magrittr_1.5 checkmate_1.9.4
## [17] memoise_1.1.0 fit.models_0.5-14
## [19] cluster_2.0.8 annotate_1.64.0
## [21] modelr_0.1.5 matrixStats_0.55.0
## [23] colorspace_1.4-1 blob_1.2.0
## [25] rvest_0.3.5 rrcov_1.4-7
## [27] haven_2.2.0 xfun_0.8
## [29] crayon_1.3.4 RCurl_1.95-4.12
## [31] jsonlite_1.6 genefilter_1.68.0
## [33] zeallot_0.1.0 impute_1.60.0
## [35] survival_2.44-1.1 gtable_0.3.0
## [37] zlibbioc_1.32.0 XVector_0.26.0
## [39] DelayedArray_0.12.2 BiocGenerics_0.32.0
## [41] DEoptimR_1.0-8 scales_1.1.0
## [43] mvtnorm_1.0-11 DBI_1.1.0
## [45] Rcpp_1.0.3 xtable_1.8-4
## [47] htmlTable_1.13.1 foreign_0.8-71
## [49] bit_1.1-15.1 preprocessCore_1.48.0
## [51] Formula_1.2-3 stats4_3.6.0
## [53] htmlwidgets_1.5.1 httr_1.4.1
## [55] acepack_1.4.1 farver_2.0.3
## [57] pkgconfig_2.0.3 reshape_0.8.8
## [59] XML_3.98-1.20 nnet_7.3-12
## [61] dbplyr_1.4.2 locfit_1.5-9.1
## [63] later_1.0.0 tidyselect_0.2.5
## [65] labeling_0.3 rlang_0.4.2
## [67] AnnotationDbi_1.48.0 munsell_0.5.0
## [69] cellranger_1.1.0 tools_3.6.0
## [71] cli_2.0.1 generics_0.0.2
## [73] RSQLite_2.2.0 broom_0.5.3
## [75] fastmap_1.0.1 evaluate_0.14
## [77] yaml_2.2.0 bit64_0.9-7
## [79] fs_1.3.1 robustbase_0.93-5
## [81] nlme_3.1-139 mime_0.8
## [83] xml2_1.2.2 compiler_3.6.0
## [85] rstudioapi_0.10 reprex_0.3.0
## [87] geneplotter_1.64.0 pcaPP_1.9-73
## [89] stringi_1.4.5 highr_0.8
## [91] lattice_0.20-38 Matrix_1.2-17
## [93] vctrs_0.2.1 pillar_1.4.3
## [95] lifecycle_0.1.0 data.table_1.12.8
## [97] bitops_1.0-6 httpuv_1.5.2
## [99] GenomicRanges_1.38.0 R6_2.4.1
## [101] latticeExtra_0.6-28 promises_1.1.0
## [103] IRanges_2.20.2 codetools_0.2-16
## [105] MASS_7.3-51.4 assertthat_0.2.1
## [107] SummarizedExperiment_1.16.1 DESeq2_1.26.0
## [109] withr_2.1.2 S4Vectors_0.24.2
## [111] GenomeInfoDbData_1.2.2 hms_0.5.3
## [113] grid_3.6.0 rpart_4.1-15
## [115] rmarkdown_1.14 Cairo_1.5-10
## [117] shiny_1.4.0 Biobase_2.46.0
## [119] lubridate_1.7.4 base64enc_0.1-3